Chapter 5 Alpha diversity

5.1 Load data

Load the data produced in the 2nd chapter

load("data/data.Rdata")
quality <- read_tsv("results/final_combined_stats.tsv",
  col_types = cols_only(microsample = col_character(), quality = col_double()),
  show_col_types = FALSE
)

Define function to estimate alpha diversity measurements.

calculate_alpha_diversity <- function(input_data, dataset_name) {
  # Step 1: Transform the input data (remove rownames if needed)
  input_data_matrix <- input_data %>%
    column_to_rownames(var = "genome")

  # Step 2: Calculate richness (q = 0)
  richness <- hilldiv(input_data_matrix, q = 0) %>%
    t() %>%
    as.data.frame() %>%
    rename(richness = 1) %>%
    rownames_to_column(var = "microsample")

  # Step 3: Calculate neutral diversity (q = 1)
  neutral <- hilldiv(input_data_matrix, q = 1) %>%
    t() %>%
    as.data.frame() %>%
    rename(neutral = 1) %>%
    rownames_to_column(var = "microsample")

  # Step 4: Calculate phylogenetic diversity (q = 1, with genome tree)
  phylogenetic <- hilldiv(input_data_matrix, q = 1, tree = genome_tree) %>%
    t() %>%
    as.data.frame() %>%
    rename(phylogenetic = 1) %>%
    rownames_to_column(var = "microsample")

  # Step 5: Merge all diversity metrics
  alpha_diversity <- richness %>%
    full_join(neutral, by = "microsample") %>%
    full_join(phylogenetic, by = "microsample") %>%
    left_join(sample_metadata, by = "microsample")

  # Step 6: Define the output file name based on the dataset name
  output_filename <- paste0("results/alpha_div_", dataset_name, ".tsv")

  # Step 7: Write the result to a tsv file
  alpha_diversity %>% write_tsv(output_filename)

  # Return the alpha_diversity data frame
  return(alpha_diversity)
}

Estimate the alpha diversity on the unfiltered and the coverage-filtered counts

# Calculate alpha diversity for filtered genome counts data
alpha_div_filtered <- calculate_alpha_diversity(
  input_data = genome_counts_filt,
  dataset_name = "filtered"
)
# Calculate alpha diversity for unfiltered genome counts data
alpha_div_unfiltered <- calculate_alpha_diversity(
  input_data = genome_counts,
  dataset_name = "unfiltered"
)
plot_alpha_diversity <- function(alpha_div_dataset, plot_base_name, plot_params_list, filter_type = NULL) {
  # Initialize a list to store the plots
  plots_list <- list()

  # Loop through each set of plot parameters and generate plots
  for (param_name in names(plot_params_list)) {
    plot_params <- plot_params_list[[param_name]]

    # Apply filters based on plot_params and filter_type if provided
    filtered_data <- alpha_div_dataset

    # Dynamically apply filter conditions1
    if (length(plot_params$filter_conditions) > 0) {
      filtered_data <- filtered_data %>% filter(!!!plot_params$filter_conditions)
    }

    # Dynamically apply the type filter (positive/negative) if provided
    if (!is.null(filter_type)) {
      filtered_data <- filtered_data %>% filter(type %in% filter_type)
    }

    # Conditionally apply factor() for size based on the presence of 'size' in plot_params$fill_var
    if (grepl("size", plot_params$fill_var)) {
      filtered_data <- filtered_data %>%
        mutate(size = factor(size, levels = c(500, 1500, 2500, 5000, 25000, 50000)))
    }

    # Pivot the relevant columns for diversity metrics
    filtered_data <- filtered_data %>%
      pivot_longer(
        cols = c(richness, neutral, phylogenetic),
        names_to = "metric",
        values_to = "value"
      ) %>%
      left_join(quality, by = join_by(microsample == microsample)) %>%
      mutate(metric = factor(metric, levels = c("richness", "neutral", "phylogenetic"))) %>%
      filter(!is.na(value)) # Filter out rows with NA values in the value column

    new_facet_formula <- gsub("(.*) ~ \\.", "metric ~ \\1", plot_params$facet_formula)

    # Create the plot
    plot <- ggplot(filtered_data, aes(x = plot_params$fill_var, y = value, color = quality)) +
      scale_color_gradient(low = "#c90076", high = "#3598bf", name = "Quality", limits = c(0, 5)) +
      geom_boxplot(outlier.shape = NA) +
      geom_jitter(width = 0.2) +
      facet_nested(as.formula(new_facet_formula), scales = "free", space = "fixed") +
      theme_minimal() +
      custom_ggplot_theme +
      labs(
        title = plot_params$plot_title,
        x = stat_params$fill_var
      )

    # Store the plot in the list
    plots_list[[param_name]] <- plot

    # Print the plot
    print(plot)

    # Save the plot as an image file
    ggsave(
      filename = paste0("results/figures/alpha_diversity_plots/", plot_base_name, param_name, ".jpg"),
      plot = plot,
      device = "jpg",
      width = 30,
      height = 30,
      units = "cm",
      dpi = 300,
      limitsize = FALSE
    )
  }

  # Return the list of plots
  return(plots_list)
}

Coverage-filtered data, positive samples

alpha_div_plots <- plot_alpha_diversity(
  alpha_div_dataset = alpha_div_filtered,
  plot_base_name = "alpha_div_filt_pos_",
  plot_params_list = plot_params_list,
  filter_type = c("Positive")
)

Coverage-filtered data, all samples

alpha_div_plots <- plot_alpha_diversity(
  alpha_div_dataset = alpha_div_filtered,
  plot_base_name = "alpha_div_filt_all_",
  plot_params_list = plot_params_list,
  filter_type = NULL
)

Unfiltered data, positive samples

alpha_div_plots <- plot_alpha_diversity(
  alpha_div_dataset = alpha_div_unfiltered,
  plot_base_name = "alpha_div_unfilt_pos_",
  plot_params_list = plot_params_list,
  filter_type = c("Positive")
)

Unfiltered data, all samples

alpha_div_plots <- plot_alpha_diversity(
  alpha_div_dataset = alpha_div_unfiltered,
  plot_base_name = "alpha_div_unfilt_all_",
  plot_params_list = plot_params_list,
  filter_type = NULL
)